import csv
import os
import random
import re
import time
import traceback
from collections import Counter

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.common.by import By

import mysql_connect

"""
    提取问题和文章的网址和标题
"""


class ZhihuQuestion():
    """A Zhihu question extracted from a saved answer-list page.

    Attributes:
        url: the question's URL.
        title: the question's title.
        display_text: optional display text (callers in this file pass None).
        zhihu_data_id: id of the source HTML file the question came from.
    """

    def __init__(self, url, title, display_text, zhihu_data_id):
        self.url = url
        self.title = title
        self.display_text = display_text
        self.zhihu_data_id = zhihu_data_id

    def __repr__(self):
        # Fixed: the closing parenthesis used to sit before zhihu_data_id,
        # producing "ZhihuQuestion(...), zhihu_data_id=...".
        return (f"ZhihuQuestion(url={self.url}, title={self.title}, "
                f"display_text={self.display_text}, zhihu_data_id={self.zhihu_data_id})")

    def __str__(self):
        return repr(self)

    def __eq__(self, other):
        # Value equality (and the matching hash below) makes Counter-based
        # duplicate detection in extract_data() actually work; without these,
        # Counter counted object identities and never found duplicates.
        if not isinstance(other, ZhihuQuestion):
            return NotImplemented
        return (self.url, self.title, self.display_text, self.zhihu_data_id) == \
               (other.url, other.title, other.display_text, other.zhihu_data_id)

    def __hash__(self):
        return hash((self.url, self.title, self.display_text, self.zhihu_data_id))


class ZhihuEssay():
    """A Zhihu essay (article) extracted from a saved answer-list page.

    Attributes:
        url: the article's URL.
        title: the article's title.
        display_text: optional display text (callers in this file pass None).
        zhihu_data_id: id of the source HTML file the essay came from.
    """

    def __init__(self, url, title, display_text, zhihu_data_id):
        self.url = url
        self.title = title
        self.display_text = display_text
        self.zhihu_data_id = zhihu_data_id

    def __repr__(self):
        # Fixed: the closing parenthesis used to sit before zhihu_data_id,
        # producing "ZhihuEssay(...), zhihu_data_id=...".
        return (f"ZhihuEssay(url={self.url}, title={self.title}, "
                f"display_text={self.display_text}, zhihu_data_id={self.zhihu_data_id})")

    def __str__(self):
        return repr(self)

    def __eq__(self, other):
        # Value equality + hash, consistent with ZhihuQuestion, so essays can
        # be deduplicated / counted by value rather than by identity.
        if not isinstance(other, ZhihuEssay):
            return NotImplemented
        return (self.url, self.title, self.display_text, self.zhihu_data_id) == \
               (other.url, other.title, other.display_text, other.zhihu_data_id)

    def __hash__(self):
        return hash((self.url, self.title, self.display_text, self.zhihu_data_id))


def get_answers_list():
    """Return the file names of all saved top-answer-list HTML pages."""
    return os.listdir("other_files/top_answers_list_htmls")


def extract_data_one(file_name):
    """Parse one saved HTML page and return its questions and essays.

    Args:
        file_name: name of a file inside other_files/top_answers_list_htmls/;
            its stem (text before the first ".") is used as the data id.

    Returns:
        A (question_list, essay_list) tuple of ZhihuQuestion / ZhihuEssay.
    """
    with open(f"other_files/top_answers_list_htmls/{file_name}", "r",
              encoding="utf-8") as page:
        soup = BeautifulSoup(page.read(), "html.parser")
    data_id = file_name.split(".")[0]
    return get_question_info(soup, data_id), get_essay_info(soup, data_id)


def get_question_info(soup: BeautifulSoup, zhihu_data_id: str):
    """Collect all questions embedded in the page.

    Questions are marked with <div itemprop="zhihu:question"> and carry their
    url/title in child <meta> tags.

    Returns:
        A list of ZhihuQuestion (display_text left as None).
    """
    return [
        ZhihuQuestion(
            node.find("meta", {"itemprop": "url"}).get("content"),
            node.find("meta", {"itemprop": "name"}).get("content"),
            None,
            zhihu_data_id,
        )
        for node in soup.find_all("div", {"itemprop": "zhihu:question"})
    ]


def get_essay_info(soup: BeautifulSoup, zhihu_data_id: str):
    """Collect all essays (articles) linked from the page.

    Essay links are <a> tags with rel="noopener noreferrer" and the
    data-za-detail-view-element_name="Title" attribute.

    Returns:
        A list of ZhihuEssay (display_text left as None).
    """
    anchors = soup.find_all(
        "a",
        {"rel": "noopener noreferrer", "data-za-detail-view-element_name": "Title"},
    )
    return [
        ZhihuEssay(anchor.get("href"), anchor.text, None, zhihu_data_id)
        for anchor in anchors
    ]


def save_in_csv(zhihu_question_all_list: list[ZhihuQuestion], zhihu_essay_all_list: list[ZhihuEssay]):
    """Persist questions and essays to CSV files.

    Bug fix: the previous version dumped `str(list_of_dicts)` — a Python repr,
    not CSV — into the .csv files, which no CSV reader could parse.  Real CSV
    with a header row is now written via csv.DictWriter.

    Args:
        zhihu_question_all_list: all extracted ZhihuQuestion objects.
        zhihu_essay_all_list: all extracted ZhihuEssay objects.
    """
    fieldnames = ["url", "title", "display_text", "zhihu_data_id"]

    def _write_csv(path, items):
        # newline="" per the csv module docs, to avoid blank rows on Windows.
        with open(path, "w", encoding="utf-8", newline="") as out_file:
            writer = csv.DictWriter(out_file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(dict(item.__dict__) for item in items)

    print("转换question为字典完毕")
    _write_csv("other_files/question_info/question_info.csv", zhihu_question_all_list)
    print("存储question完毕")
    print("转换essay为字典完毕")
    _write_csv("other_files/essay_info/essay_info.csv", zhihu_essay_all_list)
    print("存储essay完毕")


def extract_data():
    """Extract questions/essays from every saved HTML page and persist them.

    Bug fix: the duplicate check previously ran Counter over
    `zhihu_question_list` — only the questions from the LAST file processed —
    and counted object identities (ZhihuQuestion defined no __eq__/__hash__),
    so cross-file duplicates were never reported.  It now counts question
    URLs over the full accumulated list.
    """
    file_name_list: list[str] = get_answers_list()
    zhihu_question_all_list: list[ZhihuQuestion] = []
    zhihu_essay_all_list: list[ZhihuEssay] = []
    for index, file_name in enumerate(file_name_list):
        print(f"正在提取的是{file_name},第{index}个,总共{len(file_name_list)}个")
        zhihu_question_list, zhihu_essay_list = extract_data_one(file_name)
        zhihu_question_all_list.extend(zhihu_question_list)
        zhihu_essay_all_list.extend(zhihu_essay_list)

    # The URL is a stable identity for a question across files.
    duplicate_count = Counter(question.url for question in zhihu_question_all_list)
    for url, count in duplicate_count.items():
        if count > 1:
            print(f"重复的ZhihuQuestion: {url}, 重复数量: {count}")

    save_in_csv(zhihu_question_all_list, zhihu_essay_all_list)


if __name__ == "__main__":
    extract_data()
